In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsTransformer, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline

# Visualisation libraries

## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex

## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("white")

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors

plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

import warnings
warnings.filterwarnings("ignore")
Breast Cancer Wisconsin (Diagnostic) Dataset

In this article, we compare a number of classificaiton method for the breast cancer dataset. The details regarding this dataset can be found in Diagnostic Wisconsin Breast Cancer Database).

In this article, we compare a number of classification methods for the breast cancer dataset. We would use the following classification methods and then compare them in terms of performance.

Throughout this website, there are a large number of methods that discuss these methods. Here, we will not discuss these methods and only apply them. Interested readers are encouraged to see Statistical Learning.

In [2]:
data = load_breast_cancer()
df = pd.DataFrame(data['data'], columns = data['feature_names'])
Temp = [x.title() for x in data['target_names'].tolist()]
df['Target'] = data['target']
df['Diagnosis'] = df['Target'].map(lambda x: Temp[1] if x == 1 else Temp[0])
del Temp
df
Out[2]:
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension Target Diagnosis
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 0.2419 0.07871 ... 184.60 2019.0 0.16220 0.66560 0.7119 0.2654 0.4601 0.11890 0 Malignant
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 0.1812 0.05667 ... 158.80 1956.0 0.12380 0.18660 0.2416 0.1860 0.2750 0.08902 0 Malignant
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 0.2069 0.05999 ... 152.50 1709.0 0.14440 0.42450 0.4504 0.2430 0.3613 0.08758 0 Malignant
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 0.2597 0.09744 ... 98.87 567.7 0.20980 0.86630 0.6869 0.2575 0.6638 0.17300 0 Malignant
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 0.1809 0.05883 ... 152.20 1575.0 0.13740 0.20500 0.4000 0.1625 0.2364 0.07678 0 Malignant
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
564 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 0.1726 0.05623 ... 166.10 2027.0 0.14100 0.21130 0.4107 0.2216 0.2060 0.07115 0 Malignant
565 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 0.1752 0.05533 ... 155.00 1731.0 0.11660 0.19220 0.3215 0.1628 0.2572 0.06637 0 Malignant
566 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 0.1590 0.05648 ... 126.70 1124.0 0.11390 0.30940 0.3403 0.1418 0.2218 0.07820 0 Malignant
567 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 0.2397 0.07016 ... 184.60 1821.0 0.16500 0.86810 0.9387 0.2650 0.4087 0.12400 0 Malignant
568 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 0.1587 0.05884 ... 59.16 268.6 0.08996 0.06444 0.0000 0.0000 0.2871 0.07039 1 Benign

569 rows × 32 columns

As can be seen, the number of instances is 569 and the number of attributes is 32. The object of the exercise is to create a classification model that can classify the type of Diagnosis base on the rest of the attributes. However, first, let's plot a count plot for Diagnosis attribute.

In [3]:
Temp = df.groupby(['Diagnosis'])['Diagnosis'].agg({'count'}).reset_index(drop = False).rename(columns ={'count': 'Count'})
Temp['Percentage'] = np.round(100* Temp['Count'].values /Temp['Count'].sum(), 2)

# display(Temp.style.hide_index())

fig = px.bar(Temp, y= 'Diagnosis', x= 'Percentage', orientation='h', text = 'Count', color_discrete_sequence= ['Bisque'],
             height= 220)
fig.update_traces(marker_line_color= 'DarkRed', marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(title = 'Churn Percentage by Gender', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Modeling

In [4]:
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
                   'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T
Out[4]:
Set X_train X_test y_train y_test
Shape (426, 30) (143, 30) (426,) (143,)

KNeighbors

See this link for more details.

In [5]:
# Neighbors List
n_neighbors_list = list(np.arange(1,11,1))

# Transforming X into a (weighted) graph of k nearest neighbors
graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list), mode='distance')

# KNeighborsClassifier
classifier_model = KNeighborsClassifier(metric='precomputed')

# Making a pipline
full_model = Pipeline(steps=[('graph', graph_model), ('classifier', classifier_model)])

# Paramter Grid
param_grid = {'classifier__n_neighbors': n_neighbors_list}

# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(full_model, param_grid)

# Fitting
_ = grid_model.fit(X_train, y_train)

display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Paramerers': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())

display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                           'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))

# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 6.5))
# Left
_ = ax[0].errorbar(x=n_neighbors_list,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='n_neighbors', title='Classification accuracy')
# Right
_ = ax[1].errorbar(x=n_neighbors_list,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='n_neighbors', title='Fit time (with caching)')
fig.tight_layout()

del graph_model, classifier_model, full_model, param_grid, ax
Best Score Best Paramerers Accuracy
0.929400 {'classifier__n_neighbors': 8} 0.958000
rank_test_score params mean_test_score
1 {'classifier__n_neighbors': 8} 0.9294
2 {'classifier__n_neighbors': 10} 0.9271
3 {'classifier__n_neighbors': 7} 0.9247
4 {'classifier__n_neighbors': 6} 0.9224
5 {'classifier__n_neighbors': 9} 0.9224
6 {'classifier__n_neighbors': 3} 0.9201
7 {'classifier__n_neighbors': 5} 0.9154
8 {'classifier__n_neighbors': 2} 0.9131
9 {'classifier__n_neighbors': 4} 0.9107
10 {'classifier__n_neighbors': 1} 0.9060

Logistic Regression

See sklearn.linear_model.LogisticRegression for more details.

In [6]:
# regularization strength
Regularization_Strength = [10.0**x for x in range(4)]

# Inverse of regularization strength
C = [1/x for x in Regularization_Strength]

# Parameters
param_grid = {'tol': [10.0**x for x in np.arange(-2, -5, -1)], 'C': C,}

# Logistic Regression
logistic = LogisticRegression(max_iter=10000)

# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(logistic, param_grid, n_jobs=-1)

# Fitting
_ = grid_model.fit(X_train, y_train)

display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Paramerers': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())

display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                           'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))

Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]

# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# left
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()

del Temp, grid_model, logistic
Best Score Best Paramerers Accuracy
0.948300 {'C': 1.0, 'tol': 0.01} 0.972000
rank_test_score params mean_test_score
1 {'C': 1.0, 'tol': 0.01} 0.9483
1 {'C': 1.0, 'tol': 0.001} 0.9483
1 {'C': 1.0, 'tol': 0.0001} 0.9483
4 {'C': 0.1, 'tol': 0.01} 0.9436
4 {'C': 0.1, 'tol': 0.001} 0.9436
4 {'C': 0.1, 'tol': 0.0001} 0.9436
7 {'C': 0.01, 'tol': 0.01} 0.9343
7 {'C': 0.01, 'tol': 0.001} 0.9343
7 {'C': 0.01, 'tol': 0.0001} 0.9343
10 {'C': 0.001, 'tol': 0.01} 0.9271
10 {'C': 0.001, 'tol': 0.001} 0.9271
10 {'C': 0.001, 'tol': 0.0001} 0.9271

PCA with Logistic Regression

In [7]:
param_grid = {'pca__n_components': [2, 5, 10, 15, 25, 30], 'logistic__C': np.logspace(-4, 4, 4),}

# Logistic Regression
logistic = LogisticRegression(max_iter=10000, tol=0.1)

# Principle Component Analysis
pca = PCA()

# Making a pipline
full_model = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(full_model, param_grid, n_jobs=-1)

# Fitting
_ = grid_model.fit(X_train, y_train)

display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Paramerers': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())

display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                           'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))

Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]

# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 10))
# left
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()

del Temp, full_model, grid_model, PCA, logistic
Best Score Best Paramerers Accuracy
0.964800 {'logistic__C': 21.54434690031882, 'pca__n_components': 25} 0.958000
rank_test_score params mean_test_score
1 {'logistic__C': 21.54434690031882, 'pca__n_components': 30} 0.9648
1 {'logistic__C': 21.54434690031882, 'pca__n_components': 25} 0.9648
3 {'logistic__C': 10000.0, 'pca__n_components': 15} 0.9624
4 {'logistic__C': 10000.0, 'pca__n_components': 30} 0.9601
5 {'logistic__C': 10000.0, 'pca__n_components': 10} 0.9601
5 {'logistic__C': 21.54434690031882, 'pca__n_components': 15} 0.9601
7 {'logistic__C': 21.54434690031882, 'pca__n_components': 10} 0.9554
7 {'logistic__C': 10000.0, 'pca__n_components': 25} 0.9554
9 {'logistic__C': 0.046415888336127774, 'pca__n_components': 5} 0.9437
10 {'logistic__C': 10000.0, 'pca__n_components': 5} 0.9413
10 {'logistic__C': 21.54434690031882, 'pca__n_components': 5} 0.9413
10 {'logistic__C': 0.046415888336127774, 'pca__n_components': 30} 0.9413
10 {'logistic__C': 0.046415888336127774, 'pca__n_components': 15} 0.9413
10 {'logistic__C': 0.046415888336127774, 'pca__n_components': 10} 0.9413
10 {'logistic__C': 0.046415888336127774, 'pca__n_components': 25} 0.9413
16 {'logistic__C': 21.54434690031882, 'pca__n_components': 2} 0.9130
16 {'logistic__C': 0.046415888336127774, 'pca__n_components': 2} 0.9130
16 {'logistic__C': 10000.0, 'pca__n_components': 2} 0.9130
19 {'logistic__C': 0.0001, 'pca__n_components': 30} 0.9106
19 {'logistic__C': 0.0001, 'pca__n_components': 25} 0.9106
19 {'logistic__C': 0.0001, 'pca__n_components': 15} 0.9106
19 {'logistic__C': 0.0001, 'pca__n_components': 10} 0.9106
19 {'logistic__C': 0.0001, 'pca__n_components': 5} 0.9106
24 {'logistic__C': 0.0001, 'pca__n_components': 2} 0.9083

Decision Tree Classifier

See sklearn.tree.DecisionTreeClassifier for more details.

In [8]:
# Parameters
param_grid = {'criterion':['gini','entropy'], 'max_depth': np.arange(2,14)}

# Logistic Regression
dtc = DecisionTreeClassifier()

# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(dtc, param_grid, n_jobs=-1)

# Fitting
_ = grid_model.fit(X_train, y_train)

display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Paramerers': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())

display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                           'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))

Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]

# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# left
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()

del Temp, grid_model
Best Score Best Paramerers Accuracy
0.931900 {'criterion': 'entropy', 'max_depth': 3} 0.965000
rank_test_score params mean_test_score
1 {'criterion': 'entropy', 'max_depth': 3} 0.9319
1 {'criterion': 'entropy', 'max_depth': 12} 0.9319
3 {'criterion': 'gini', 'max_depth': 5} 0.9295
4 {'criterion': 'entropy', 'max_depth': 10} 0.9295
5 {'criterion': 'gini', 'max_depth': 13} 0.9272
6 {'criterion': 'entropy', 'max_depth': 9} 0.9272
6 {'criterion': 'entropy', 'max_depth': 5} 0.9272
8 {'criterion': 'gini', 'max_depth': 3} 0.9271
9 {'criterion': 'entropy', 'max_depth': 7} 0.9248
10 {'criterion': 'entropy', 'max_depth': 13} 0.9248
11 {'criterion': 'gini', 'max_depth': 7} 0.9225
12 {'criterion': 'gini', 'max_depth': 4} 0.9225
13 {'criterion': 'entropy', 'max_depth': 4} 0.9225
13 {'criterion': 'gini', 'max_depth': 2} 0.9225
15 {'criterion': 'entropy', 'max_depth': 2} 0.9224
16 {'criterion': 'entropy', 'max_depth': 6} 0.9201
17 {'criterion': 'entropy', 'max_depth': 8} 0.9201
17 {'criterion': 'gini', 'max_depth': 6} 0.9201
19 {'criterion': 'gini', 'max_depth': 9} 0.9178
19 {'criterion': 'entropy', 'max_depth': 11} 0.9178
21 {'criterion': 'gini', 'max_depth': 11} 0.9154
22 {'criterion': 'gini', 'max_depth': 10} 0.9132
23 {'criterion': 'gini', 'max_depth': 12} 0.9131
24 {'criterion': 'gini', 'max_depth': 8} 0.9108

Support Vector Machine

See this link for more details.

In [9]:
# Parameters
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }

# Support Vector Machine
svm = SVC(kernel='rbf', class_weight='balanced')

# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(svm, param_grid)

# Fitting
_ = grid_model.fit(X_train, y_train)

display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Paramerers': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())

display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                           'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))

Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]

# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# left
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()

del Temp, grid_model, svm
Best Score Best Paramerers Accuracy
0.922500 {'C': 1000.0, 'gamma': 0.0005} 0.923100
rank_test_score params mean_test_score
1 {'C': 1000.0, 'gamma': 0.0005} 0.9225
1 {'C': 50000.0, 'gamma': 0.0005} 0.9225
1 {'C': 10000.0, 'gamma': 0.0005} 0.9225
1 {'C': 100000.0, 'gamma': 0.0005} 0.9225
1 {'C': 5000.0, 'gamma': 0.0005} 0.9225
6 {'C': 1000.0, 'gamma': 0.0001} 0.9178
6 {'C': 10000.0, 'gamma': 0.0001} 0.9178
6 {'C': 100000.0, 'gamma': 0.0001} 0.9178
6 {'C': 5000.0, 'gamma': 0.0001} 0.9178
6 {'C': 50000.0, 'gamma': 0.0001} 0.9178
11 {'C': 10000.0, 'gamma': 0.005} 0.9107
11 {'C': 5000.0, 'gamma': 0.005} 0.9107
11 {'C': 50000.0, 'gamma': 0.005} 0.9107
11 {'C': 1000.0, 'gamma': 0.005} 0.9107
11 {'C': 100000.0, 'gamma': 0.005} 0.9107
16 {'C': 50000.0, 'gamma': 0.001} 0.9037
16 {'C': 100000.0, 'gamma': 0.001} 0.9037
16 {'C': 10000.0, 'gamma': 0.001} 0.9037
16 {'C': 5000.0, 'gamma': 0.001} 0.9037
16 {'C': 1000.0, 'gamma': 0.001} 0.9037
21 {'C': 10000.0, 'gamma': 0.01} 0.6338
21 {'C': 100000.0, 'gamma': 0.01} 0.6338
21 {'C': 5000.0, 'gamma': 0.01} 0.6338
21 {'C': 50000.0, 'gamma': 0.01} 0.6338
21 {'C': 1000.0, 'gamma': 0.01} 0.6338
26 {'C': 5000.0, 'gamma': 0.1} 0.6291
26 {'C': 50000.0, 'gamma': 0.1} 0.6291
26 {'C': 1000.0, 'gamma': 0.1} 0.6291
26 {'C': 10000.0, 'gamma': 0.1} 0.6291
26 {'C': 100000.0, 'gamma': 0.1} 0.6291

Random Forest Classifier

A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. See sklearn.ensemble.RandomForestClassifier for more details.

In [10]:
# Parameters
param_grid = {'n_estimators': [n*100 for n in [2**m for m in np.arange(0,2)]],
              'max_depth': list(np.arange(2,4)),
              'min_samples_leaf': [10.0**x for x in np.arange(-1,-4,-1)]},

# Random Forest Classifier
rfc = RandomForestClassifier()

# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(rfc, param_grid)

# Fitting
_ = grid_model.fit(X_train, y_train)

display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Paramerers': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())

display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                           'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))

Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]

# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# left
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()

del Temp, grid_model, rfc
Best Score Best Paramerers Accuracy
0.950800 {'max_depth': 3, 'min_samples_leaf': 0.001, 'n_estimators': 200} 0.965000
rank_test_score params mean_test_score
1 {'max_depth': 3, 'min_samples_leaf': 0.001, 'n_estimators': 200} 0.9508
2 {'max_depth': 3, 'min_samples_leaf': 0.001, 'n_estimators': 100} 0.9460
3 {'max_depth': 3, 'min_samples_leaf': 0.01, 'n_estimators': 100} 0.9460
4 {'max_depth': 3, 'min_samples_leaf': 0.01, 'n_estimators': 200} 0.9437
5 {'max_depth': 2, 'min_samples_leaf': 0.01, 'n_estimators': 200} 0.9413
6 {'max_depth': 2, 'min_samples_leaf': 0.001, 'n_estimators': 100} 0.9390
7 {'max_depth': 2, 'min_samples_leaf': 0.001, 'n_estimators': 200} 0.9366
8 {'max_depth': 2, 'min_samples_leaf': 0.01, 'n_estimators': 100} 0.9366
9 {'max_depth': 3, 'min_samples_leaf': 0.1, 'n_estimators': 100} 0.9295
10 {'max_depth': 2, 'min_samples_leaf': 0.1, 'n_estimators': 100} 0.9248
11 {'max_depth': 2, 'min_samples_leaf': 0.1, 'n_estimators': 200} 0.9248
12 {'max_depth': 3, 'min_samples_leaf': 0.1, 'n_estimators': 200} 0.9154

Gradient Boosting Classifier

Gradient Boosting Classifier builds the model in a stage-wise fashion and it generalizes them by allowing optimization of an arbitrary differentiable loss function [Source]. See sklearn.ensemble.GradientBoostingClassifier for more details.

In [11]:
# Parameters
param_grid = {'loss': ['deviance', 'exponential'],
              'learning_rate': [0.1, 0.2, 0.3],
              'n_estimators': [100, 200],
              'subsample': [0.5, 1.0]}

# Gradient Boosting Classifier
gbc = GradientBoostingClassifier()

# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(gbc, param_grid, n_jobs=-1)

# Fitting
_ = grid_model.fit(X_train, y_train)

display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Paramerers': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())

display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                           'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))

Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]

# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# left
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()

del Temp, grid_model, gbc
Best Score Best Paramerers Accuracy
0.971800 {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 0.5} 0.965000
rank_test_score params mean_test_score
1 {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 0.5} 0.9718
2 {'learning_rate': 0.3, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 0.5} 0.9694
3 {'learning_rate': 0.2, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 0.5} 0.9694
4 {'learning_rate': 0.3, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 0.5} 0.9671
5 {'learning_rate': 0.3, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 0.5} 0.9671
6 {'learning_rate': 0.1, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 0.5} 0.9671
6 {'learning_rate': 0.2, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 0.5} 0.9671
8 {'learning_rate': 0.3, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 0.5} 0.9647
9 {'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 0.5} 0.9647
9 {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 0.5} 0.9647
11 {'learning_rate': 0.1, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 0.5} 0.9624
12 {'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 0.5} 0.9624
13 {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 1.0} 0.9601
13 {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 1.0} 0.9601
15 {'learning_rate': 0.3, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 1.0} 0.9577
15 {'learning_rate': 0.3, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 1.0} 0.9577
17 {'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 1.0} 0.9530
18 {'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 1.0} 0.9530
18 {'learning_rate': 0.2, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 1.0} 0.9530
20 {'learning_rate': 0.3, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 1.0} 0.9507
20 {'learning_rate': 0.3, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 1.0} 0.9507
20 {'learning_rate': 0.1, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 1.0} 0.9507
20 {'learning_rate': 0.2, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 1.0} 0.9507
24 {'learning_rate': 0.1, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 1.0} 0.9483

Multi-layer Perceptron Classifier (Neural Network)

This model optimizes the log-loss function using LBFGS or stochastic gradient descent. See sklearn.neural_network.MLPClassifier.

In [12]:
# Parameters
param_grid = {'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [10.0**x for x in np.arange(-1,-4,-1)],
              'learning_rate' : ['constant', 'invscaling', 'adaptive']}

# Multi-layer Perceptron classifier
mlp = MLPClassifier(max_iter = 1000)

# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(mlp, param_grid, n_jobs=-1)

# Fitting
_ = grid_model.fit(X_train, y_train)

display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Paramerers': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())

display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                           'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))

Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]

# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# left
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()

del Temp, grid_model, xgb
Best Score Best Paramerers Accuracy
0.936500 {'alpha': 0.001, 'learning_rate': 'invscaling', 'solver': 'adam'} 0.958000
rank_test_score params mean_test_score
1 {'alpha': 0.001, 'learning_rate': 'invscaling', 'solver': 'adam'} 0.9365
2 {'alpha': 0.1, 'learning_rate': 'invscaling', 'solver': 'adam'} 0.9341
3 {'alpha': 0.1, 'learning_rate': 'adaptive', 'solver': 'adam'} 0.9318
4 {'alpha': 0.01, 'learning_rate': 'adaptive', 'solver': 'adam'} 0.9318
5 {'alpha': 0.1, 'learning_rate': 'constant', 'solver': 'adam'} 0.9272
6 {'alpha': 0.001, 'learning_rate': 'constant', 'solver': 'adam'} 0.9271
7 {'alpha': 0.01, 'learning_rate': 'constant', 'solver': 'adam'} 0.9059
8 {'alpha': 0.01, 'learning_rate': 'invscaling', 'solver': 'adam'} 0.8575
9 {'alpha': 0.001, 'learning_rate': 'adaptive', 'solver': 'adam'} 0.8518
10 {'alpha': 0.01, 'learning_rate': 'adaptive', 'solver': 'sgd'} 0.8494
11 {'alpha': 0.1, 'learning_rate': 'invscaling', 'solver': 'sgd'} 0.8354
12 {'alpha': 0.01, 'learning_rate': 'constant', 'solver': 'sgd'} 0.8103
13 {'alpha': 0.01, 'learning_rate': 'invscaling', 'solver': 'sgd'} 0.7789
14 {'alpha': 0.1, 'learning_rate': 'adaptive', 'solver': 'sgd'} 0.7679
15 {'alpha': 0.001, 'learning_rate': 'constant', 'solver': 'sgd'} 0.7060
16 {'alpha': 0.001, 'learning_rate': 'invscaling', 'solver': 'sgd'} 0.7021
17 {'alpha': 0.1, 'learning_rate': 'constant', 'solver': 'sgd'} 0.6686
18 {'alpha': 0.001, 'learning_rate': 'adaptive', 'solver': 'sgd'} 0.6627
19 {'alpha': 0.01, 'learning_rate': 'invscaling', 'solver': 'lbfgs'} 0.4674
20 {'alpha': 0.001, 'learning_rate': 'adaptive', 'solver': 'lbfgs'} 0.3965
21 {'alpha': 0.01, 'learning_rate': 'constant', 'solver': 'lbfgs'} 0.3709
21 {'alpha': 0.01, 'learning_rate': 'adaptive', 'solver': 'lbfgs'} 0.3709
21 {'alpha': 0.1, 'learning_rate': 'adaptive', 'solver': 'lbfgs'} 0.3709
21 {'alpha': 0.001, 'learning_rate': 'constant', 'solver': 'lbfgs'} 0.3709
21 {'alpha': 0.1, 'learning_rate': 'invscaling', 'solver': 'lbfgs'} 0.3709
21 {'alpha': 0.001, 'learning_rate': 'invscaling', 'solver': 'lbfgs'} 0.3709
21 {'alpha': 0.1, 'learning_rate': 'constant', 'solver': 'lbfgs'} 0.3709
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-e05513689987> in <module>
     42 fig.tight_layout()
     43 
---> 44 del Temp, grid_model, xgb

NameError: name 'xgb' is not defined

Final Words

it seems that Gradient Boosting Classifier performing slightly better than the rest of the classification method in this study. All of these classification methods are tuned in a way that performs at their best by implementing GridSearchCV.